Data Import

concrete = read_csv("concrete.csv") %>%
  janitor::clean_names()
## Parsed with column specification:
## cols(
##   Cement = col_double(),
##   BlastFurnaceSlag = col_double(),
##   FlyAsh = col_double(),
##   Water = col_double(),
##   Superplasticizer = col_double(),
##   CoarseAggregate = col_double(),
##   FineAggregate = col_double(),
##   Age = col_integer(),
##   CompressiveStrength = col_double()
## )

Part A

# matrix of predictors 
x <- model.matrix(compressive_strength~.,concrete)[,-1]
# vector of response
y <- concrete$compressive_strength
theme1 <- trellis.par.get()
theme1$plot.symbol$col <- rgb(.2, .4, .2, .5)
theme1$plot.symbol$pch <- 16
theme1$plot.line$col <- rgb(.8, .1, .1, 1)
theme1$plot.line$lwd <- 2
theme1$strip.background$col <- rgb(.0, .2, .6, .2)
trellis.par.set(theme1)
featurePlot(x, y, plot = "scatter", labels = c("","compressive strength"),
            type = c("p"), layout = c(4, 2))

Part B

#polynomial regression
fit1 <- lm(compressive_strength~water, data = concrete)
fit2 <- lm(compressive_strength~poly(water,2), data = concrete) 
fit3 <- lm(compressive_strength~poly(water,3), data = concrete) 
fit4 <- lm(compressive_strength~poly(water,4), data = concrete) 
set.seed(1)

mseK <- rep(NA, 4)
for (i in 1:4) {
    fit <- glm(compressive_strength ~ poly(water, i), data = concrete)
    mseK[i] <- cv.glm(concrete, fit, K = 10)$delta[1]
}

plot(1:4, mseK, xlab = "Exponent", ylab = "Test MSE", type = "l")

The cross validation approach produced models with very similar RMSE values. While models with the exponent of 2 and 3 for water are the lowest they aren’t by much. With that being said, the r squared value for the exponent 4 is the highest indicating that the differences in compressive_strength is more explained by water^4 than any other polynomial function. Using the for loop, the cross validation provided an exponent of 4 as the best polynomial function. Therefore lets confirm using the anova test.

anova(fit1,fit2,fit3,fit4) 
## Analysis of Variance Table
## 
## Model 1: compressive_strength ~ water
## Model 2: compressive_strength ~ poly(water, 2)
## Model 3: compressive_strength ~ poly(water, 3)
## Model 4: compressive_strength ~ poly(water, 4)
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   1028 263085                                  
## 2   1027 247712  1   15372.8 68.140 4.652e-16 ***
## 3   1026 235538  1   12174.0 53.962 4.166e-13 ***
## 4   1025 231246  1    4291.5 19.022 1.423e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot1 <- ggplot(data = concrete, aes(x = water, y = compressive_strength)) +
     geom_point(color = rgb(.2, .4, .2, .5))
plot1

Using the anova test, models with an exponent of 2, 3, and 4 are significant. However, lets use d=2 since we want the most parsimonious model.

plot(compressive_strength ~ water, data = concrete, col = "blue")
waterlims <- range(concrete$water)
water.grid <- seq(from = waterlims[1], to = waterlims[2], by = 1)

preds1 <- predict(fit1, newdata = data.frame(water = water.grid))
preds2 <- predict(fit2, newdata = data.frame(water = water.grid))
preds3 <- predict(fit3, newdata = data.frame(water = water.grid))
preds4 <- predict(fit4, newdata = data.frame(water = water.grid))

lines(water.grid, preds1, col = "red", lwd = 2)
lines(water.grid, preds2, col = "purple", lwd = 2)
lines(water.grid, preds3, col = "green", lwd = 2)
lines(water.grid, preds4, col = "brown", lwd = 2)

Part C

plot(compressive_strength ~ water, data = concrete, col = "red")

waterlims <- range(concrete$water)
water.grid <- seq(from = waterlims[1], to = waterlims[2], by = 1)
fit <- lm(compressive_strength ~ poly(water, 4), data = concrete)
preds <- predict(fit, newdata = data.frame(water = water.grid))
lines(water.grid, preds, col = "blue", lwd = 2)

fit.ss <- smooth.spline(concrete$water, concrete$compressive_strength)
fit.ss$df
## [1] 68.88205
pred.ss <- predict(fit.ss, x = water.grid)
pred.ss.df <- data.frame(pred = pred.ss$y,
                         water = water.grid)

p <- ggplot(data = concrete, aes(x = water, y = compressive_strength)) +
  geom_point(color = rgb(.2, .4, .2, .5))
p + geom_line(aes(x = water, y = pred), data = pred.ss.df, 
              color = rgb(.8, .1, .1, 1)) + theme_bw() + 
  labs(title = 'Select degrees of freedom using GCV') + 
  theme(plot.title = element_text(hjust = 0.5))

Here we will fit for different degrees of freedom for 2 to 10.

par(mfrow = c(3,3)) # 3 x 3 grid
all.dfs = rep(NA, 9)
for (i in 2:10) {
  fit.ss = smooth.spline(concrete$water, concrete$compressive_strength, df = i)
  
  pred.ss <- predict(fit.ss, x = water.grid)
  
  plot(concrete$water, concrete$compressive_strength, cex = .5, col = "darkgrey")
  title(paste("Degrees of freedom = ", round(fit.ss$df)),  outer = F)
  lines(water.grid, pred.ss$y, lwd = 2, col = "blue")
}

#Use cross validation to find degrees of freedom 
fit.ss <- smooth.spline(concrete$water, concrete$compressive_strength)
fit.ss$df
## [1] 68.88205
pred.smooth <- predict(fit.ss, x = water.grid)
pred.smooth.df <- data.frame(pred = pred.ss$y,
                         water = water.grid)

fitplot <- ggplot(data = concrete, aes(x = water, y = compressive_strength)) +
  geom_point(color = rgb(.2, .2, .4, .1)) + geom_line(aes(x = water, y = pred), data = pred.smooth.df, 
              color = rgb(.8, .1, .1, 1)) + theme_bw() 
fitplot

#fit 1
fit.ss1 <- smooth.spline(concrete$water, concrete$compressive_strength, df = 10)
fit.ss1
## Call:
## smooth.spline(x = concrete$water, y = concrete$compressive_strength, 
##     df = 10)
## 
## Smoothing Parameter  spar= 0.8687676  lambda= 0.001391258 (11 iterations)
## Equivalent Degrees of Freedom (Df): 10.00132
## Penalized Criterion (RSS): 81156.22
## GCV: 221.8629
pred.ss1 <- predict(fit.ss1, x = water.grid)
pred.ss.df1 <- data.frame(pred = pred.ss1$y, water = water.grid)
#fit 2
fit.ss2 <- smooth.spline(concrete$water, concrete$compressive_strength, df = 30)
fit.ss2
## Call:
## smooth.spline(x = concrete$water, y = concrete$compressive_strength, 
##     df = 30)
## 
## Smoothing Parameter  spar= 0.5500421  lambda= 6.929549e-06 (11 iterations)
## Equivalent Degrees of Freedom (Df): 30.00244
## Penalized Criterion (RSS): 59933.93
## GCV: 208.9676
pred.ss2 <- predict(fit.ss2, x = water.grid)
pred.ss.df2 <- data.frame(pred = pred.ss2$y, water = water.grid)
#fit 3
fit.ss3 <- smooth.spline(concrete$water, concrete$compressive_strength, df = 50)
fit.ss3
## Call:
## smooth.spline(x = concrete$water, y = concrete$compressive_strength, 
##     df = 50)
## 
## Smoothing Parameter  spar= 0.3836017  lambda= 4.347305e-07 (11 iterations)
## Equivalent Degrees of Freedom (Df): 50.00509
## Penalized Criterion (RSS): 46068.79
## GCV: 202.715
pred.ss3 <- predict(fit.ss3, x = water.grid)
pred.ss.df3 <- data.frame(pred = pred.ss3$y, water = water.grid)
#cv fit
fit.ss <- smooth.spline(concrete$water, concrete$compressive_strength)
fit.ss
## Call:
## smooth.spline(x = concrete$water, y = concrete$compressive_strength)
## 
## Smoothing Parameter  spar= 0.2578404  lambda= 5.371698e-08 (11 iterations)
## Equivalent Degrees of Freedom (Df): 68.88205
## Penalized Criterion (RSS): 36681.64
## GCV: 200.2892
pred.ss <- predict(fit.ss, x = water.grid)
pred.ss.df <- data.frame(pred = pred.ss$y, water = water.grid)
#plot of fits
scatter <- ggplot(data = concrete, aes(x = water, y = compressive_strength)) +
  geom_point(color = rgb(.2, .4, .2, .5))
smooth_fits <- scatter + 
  geom_line(aes(x = water, y = pred), data = pred.ss.df1,
          color = rgb(.8, .1, .1, 1)) + 
  geom_line(aes(x = water, y = pred), data = pred.ss.df2,
          color = rgb(0, 0, 1, 1)) + 
  geom_line(aes(x = water, y = pred), data = pred.ss.df3,
          color = rgb(1, 0, 1, 1)) + 
  geom_line(aes(x = water, y = pred), data = pred.ss.df,
          color = rgb(.2, .2, .4, 1)) + 
  theme_bw()
smooth_fits

Part D

gam.m1 <- gam(compressive_strength ~ cement + blast_furnace_slag + fly_ash + water + superplasticizer + coarse_aggregate + fine_aggregate + age, data = concrete)

gam.m2 <- gam(compressive_strength ~ cement + blast_furnace_slag + fly_ash + s(water) + superplasticizer + coarse_aggregate + fine_aggregate + age, data = concrete)

anova(gam.m1, gam.m2, test = "F")
## Analysis of Deviance Table
## 
## Model 1: compressive_strength ~ cement + blast_furnace_slag + fly_ash + 
##     water + superplasticizer + coarse_aggregate + fine_aggregate + 
##     age
## Model 2: compressive_strength ~ cement + blast_furnace_slag + fly_ash + 
##     s(water) + superplasticizer + coarse_aggregate + fine_aggregate + 
##     age
##   Resid. Df Resid. Dev     Df Deviance      F   Pr(>F)    
## 1    1021.0     110413                                    
## 2    1013.4     106140 7.5562   4272.8 5.4038 2.01e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(gam.m2)

vis.gam(gam.m2, view = c("water", "fine_aggregate"), 
        plot.type = "contour", color = "topo")

vis.gam(gam.m2, view = c("water","coarse_aggregate"),
        plot.type = "contour", color = "topo")